import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import re
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
from scipy import stats
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import plotly
plotly.offline.init_notebook_mode()
def create_dict(keys, values):
    """Pair *keys* with *values* element-wise and return the resulting dict.

    Pairing stops at the shorter of the two sequences (zip semantics).
    """
    return {key: value for key, value in zip(keys, values)}
def grouping_pop(df, level, start_rename):
    """Aggregate *df* by the *level* grouping columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data; must contain the *level* columns plus numeric columns.
    level : list of str
        Column names to group by (e.g. ['Species', 'site']).
    start_rename : int
        Number of leading columns (the group keys after reset_index) to
        leave un-suffixed when renaming the aggregated columns.

    Returns
    -------
    (df_mean, df_std, df_n) : tuple of pandas.DataFrame
        Per-group means (columns suffixed '_mean'), per-group sample
        standard deviations (columns suffixed '_std', only columns with at
        least 5 non-NaN group values are kept), and per-group row counts
        in a 'counts' column.
    """
    def _suffix_map(frame, suffix):
        # Map each aggregated column (past the group-key columns) to name+suffix.
        cols = frame.columns[start_rename:]
        return dict(zip(cols, cols + suffix))

    df_mean = df.groupby(level).mean().reset_index().dropna(axis=1)
    df_mean = df_mean.rename(columns=_suffix_map(df_mean, '_mean'))
    # thresh=5: keep only std columns backed by at least 5 non-NaN group values.
    df_std = df.groupby(level).std().reset_index().dropna(axis=1, thresh=5)
    # BUG FIX: build the '_std' mapping from df_std's own surviving columns.
    # The original built it from df_mean's columns, so whenever the two
    # dropna() calls kept different column sets, std columns could be left
    # unrenamed or renamed inconsistently.
    df_std = df_std.rename(columns=_suffix_map(df_std, '_std'))
    df_n = df.groupby(level).size().reset_index(name='counts')
    return df_mean, df_std, df_n
def plot_heatmap(cor, mode):
    """Plot the lower triangle of a correlation matrix as a seaborn heatmap.

    Parameters
    ----------
    cor : pandas.DataFrame
        Square correlation matrix.
    mode : str
        'cor' keeps signed correlations (color scale [-1, 1]);
        'abs' plots absolute values (color scale [0, 1]).
    """
    assert mode in ['cor', 'abs'], 'mode should be one of ["cor","abs"]'
    if mode == 'cor':
        vm = -1
    else:
        vm = 0
        cor = np.abs(cor)
    fig, ax = plt.subplots(figsize=(10, 8))
    # Mask the upper triangle (including the diagonal) so only the lower
    # half of the matrix is drawn.
    # BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
    # use the builtin bool instead.
    mask = np.triu(np.ones_like(cor, dtype=bool))
    # Drop the first row / last column, which the mask hides entirely.
    mask = mask[1:, :-1]
    corr = cor.iloc[1:, :-1].copy()
    sns.heatmap(corr, mask=mask, annot=False, fmt=".2f", cmap='Blues',
                vmin=vm, vmax=1, cbar_kws={"shrink": .8})
    # Keep y tick labels horizontal for readability.
    plt.yticks(rotation=0)
    plt.show()
def mean_confidence_interval(sm, n, confidence=0.95, verbose = False):
    '''
    Compute the half-width (delta to the mean) of a two-sided Student-t
    confidence interval for each group.

    Parameters
    ----------
    sm : sequence of float
        Per-group sample standard deviations.
    n : sequence of int
        Per-group number of individuals (each must be >= 2 for a finite t).
    confidence : float, default 0.95
        Two-sided confidence level.
    verbose : bool, default False
        If True, print the t, se and ci values.

    Returns
    -------
    numpy.ndarray
        Half-widths h = sm / sqrt(n) * t; add to / subtract from the group
        mean to obtain the confidence interval bounds.
    '''
    n = np.asarray(n)
    sm = np.asarray(sm)
    # stats.t.ppf is vectorized over its df argument: one call replaces the
    # original per-element Python loop, with identical numeric results.
    tval = stats.t.ppf((1 + confidence) / 2., n - 1)
    h = sm / np.sqrt(n) * tval
    if verbose:
        print('t values : {}'.format(np.around(tval, decimals=2)))
        print('\nse values : {}'.format(np.around(sm, decimals=2)))
        print('\nci values : {}'.format(np.around(h, decimals=2)))
    return h
# --- Data loading and preparation ---------------------------------------
# NOTE(review): absolute, machine-specific path — consider a relative path
# or a config option so the script runs on other machines.
df = pd.read_table("/home/xavier/Documents/research/FORMANRISK/data/data_formanrisk/individual_join.csv", sep = ";")
# Drop coordinate, bookkeeping and branch-diameter columns not used below.
df = df.drop(columns = ["X","Y",'X_TYPE_', 'X_FREQ_', 'individual', 'branch_diam', 'branch_diamn','num'])
print('dimensions of df are \nnrows : {0}\nncols : {1}'.format(df.shape[0], df.shape[1]))
# Remove the "_15" suffix from the bioclim variable names.
df.columns = [re.sub("_15", "", c) for c in df.columns]
# Extract the column indices of the bioclim variables (bio1 .. bio19).
# NOTE(review): the pattern should be a raw string (r'bio\d{1,2}') to avoid
# a Python escape-sequence warning — verify before changing.
bio_index = [i for i, item in enumerate(df.columns) if re.search('bio\d{1,2}', item)]
# Rename the bioclim variables with meaningful names (WorldClim conventions).
keys = ["bio1" ,"bio2" ,"bio3" ,"bio4" ,"bio5" ,"bio6" ,"bio7" ,"bio8" ,"bio9" ,"bio10" ,"bio11" ,"bio12" ,"bio13" ,"bio14" ,"bio15" ,"bio16" ,"bio17" ,"bio18" ,"bio19"]
values = ["Tmean_annual" ,"Mean_D_range" ,"Isothermality" ,"T_seasonality" ,"Tmax_warmerM" ,"Tmin_coldestM" ,"T_annual_range" ,"Tmean_wettestQ" ,"Tmean_driestQ" ,"Tmean_warmerQ" ,"Tmean_coldestQ" ,"P_annual" ,"P_wettestM" ,"P_driestM" ,"P_seasonality" ,"P_wettestQ" ,"P_driestQ" ,"P_warmestQ" ,"P_coldestQ"]
dictionary = create_dict(keys,values)
df = df.rename(columns = dictionary)
# Population-level aggregation: start_rename=2 skips the two group-key
# columns ('Species', 'site') when suffixing the aggregated columns.
df_pop_mean,df_pop_std,df_pop_n = grouping_pop(df=df, level=['Species','site'], start_rename=2)
# Labels of the numeric (aggregated) columns of interest.
label_num = df_pop_mean.iloc[:,2::].columns
# NOTE(review): axis=1 concat relies on the three frames sharing the same
# row order / RangeIndex from grouping_pop — confirm groups are identical.
df_pop_mean = pd.concat([df_pop_mean,df_pop_std,df_pop_n], axis = 1)
# Keep only the first occurrence of duplicated columns (the group keys).
df_pop_mean =df_pop_mean.loc[:,~df_pop_mean.columns.duplicated()]
# Same aggregation, additionally split by Treatment (start_rename=3 keys).
df_pop_mean_T,df_pop_std_T ,df_pop_n_T = grouping_pop(df=df, level=['Species','site','Treatment'], start_rename=3)
df_pop_mean_T = pd.concat([df_pop_mean_T ,df_pop_std_T ,df_pop_n_T ], axis = 1)
df_pop_mean_T =df_pop_mean_T.loc[:,~df_pop_mean_T.columns.duplicated()]
# Subset everything to Pinus pinaster.
# NOTE(review): boolean filtering keeps the original (non-contiguous) index;
# later positional loops assume a 0..n-1 index — see notes below.
df_pp = df[df.Species == "pinus pinaster"]
df_mean_pp = df_pop_mean[df_pop_mean.Species == "pinus pinaster"]
df_mean_pp_T = df_pop_mean_T[df_pop_mean_T.Species == "pinus pinaster"]
# --- Exploratory plots of P50 -------------------------------------------
# Distribution of P50 per Treatment, with a rug marginal.
fig = px.histogram(df_pp, x="P50", color = 'Treatment', marginal="rug",
                   hover_data=df_pp.columns)
fig.show()
# P50 per site, split by Treatment.
fig = px.box(df_pp, x="site", y="P50", color = 'Treatment')
fig.show()
# 95% confidence-interval half-widths for the adult P50 means.
mci = mean_confidence_interval(sm = df_mean_pp_T.loc[df_mean_pp_T['Treatment']=='adult', "P50_std"],
                               n = df_mean_pp_T.loc[df_mean_pp_T['Treatment']=='adult', "counts"],
                               verbose = False)
# Adult vs young population means with error bars.
# NOTE(review): x and y are selected from different Treatment subsets; the
# pairing assumes both subsets list the same sites in the same order —
# verify against the data.
fig = px.scatter(x=df_mean_pp_T.loc[df_mean_pp_T['Treatment']=='young', "P50_mean"],
                 y=df_mean_pp_T.loc[df_mean_pp_T['Treatment']=='adult', "P50_mean"],
                 trendline="ols",
                 error_y=mci,
                 error_y_minus=mci,
                 labels={
                     "x": "P50 mean young",
                     "y": "P50 Mean adult"
                 },
                 title="adult P50 vs young P50 per population with 95% confidence interval")
fig.show()
# Correlation heatmap (absolute values) of the population-mean variables.
df_corr= df_mean_pp[label_num].corr()
plot_heatmap(cor=df_corr, mode='abs')
# NOTE(review): redundant re-import — px is already imported at the top of
# the file (same for the re-imports further below).
import plotly.express as px
# Individual-level P50 vs mean annual temperature, with OLS trendlines.
fig = px.scatter(df_pp, x="Tmean_annual", y="P50", color="Treatment", trendline="ols", title = 'P50 vs Tmean')
fig.show()
results = px.get_trendline_results(fig)
print(results)
# results.query("Treatment == 'adult'").px_fit_results.iloc[0].summary()
# results.query("Treatment == 'young'").px_fit_results.iloc[0].summary()
# Population means, annotated with site names (matplotlib).
# Bioclim temperatures are stored in tenths of degrees, hence the /10.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(df_mean_pp['Tmean_annual_mean']/10, df_mean_pp['P50_mean'].values)
for i, txt in enumerate(df_mean_pp['site'].values):
    # NOTE(review): ['Tmean_annual_mean'][i] is a label lookup, not
    # positional; it only matches enumerate's i if df_mean_pp kept a
    # 0..n-1 index after the Species filter — verify (same below).
    ax.annotate(txt, (df_mean_pp['Tmean_annual_mean'][i]/10,df_mean_pp['P50_mean'].values[i]))
plt.ylabel("P50")
plt.xlabel("T annual mean")
plt.show()
# Same plot against the coldest-quarter mean temperature.
fig, ax = plt.subplots(figsize=(6, 6))
ax.scatter(df_mean_pp['Tmean_coldestQ_mean']/10, df_mean_pp['P50_mean'].values)
for i, txt in enumerate(df_mean_pp['site'].values):
    ax.annotate(txt, (df_mean_pp['Tmean_coldestQ_mean'][i]/10,df_mean_pp['P50_mean'].values[i]))
plt.ylabel("P50")
plt.xlabel("T mean coldest quarter")
plt.show()
# P50_mean vs several bioclim predictors, per Treatment, sites labelled.
import plotly.express as px
fig = px.scatter(df_mean_pp_T, x="Tmean_annual_mean", y="P50_mean", color="Treatment", trendline="ols",
                 text = "site")
fig.update_traces(textposition='top center')
fig.show()
results = px.get_trendline_results(fig)
print(results)
import plotly.express as px
fig = px.scatter(df_mean_pp_T, x="P_driestQ_mean", y="P50_mean", color="Treatment", trendline="ols",
                 text = "site")
fig.update_traces(textposition='top center')
fig.show()
import plotly.express as px
fig = px.scatter(df_mean_pp_T, x="Tmean_warmerQ_mean", y="P50_mean", color="Treatment", trendline="ols",
                 text = "site")
fig.update_traces(textposition='top center')
fig.show()
import plotly.express as px
fig = px.scatter(df_mean_pp_T, x="Tmin_coldestM_mean", y="P50_mean", color="Treatment", trendline="ols",
                 text = "site")
fig.update_traces(textposition='top center')
fig.show()
import plotly.express as px
fig = px.scatter(df_mean_pp_T, x="Tmean_coldestQ_mean", y="P50_mean", color="Treatment", trendline="ols",
                 text = "site")
fig.update_traces(textposition='top center')
fig.show()
# --- PCA on the bioclim population means --------------------------------
# Instantiate the scaler.
sc = StandardScaler()
# Standardize (center and scale) the '*_mean' bioclim columns.
Z = sc.fit_transform(df_mean_pp_T[[v +'_mean' for v in values]])
# Instantiate PCA; 'full' SVD keeps every component.
acp = PCA(svd_solver='full')
# Fit the PCA and project the observations.
coord = acp.fit_transform(Z)
# Number of computed components.
print(acp.n_components_)
eigval = acp.explained_variance_
# Number of observations.
n = Z.shape[0]
# Number of variables.
p = Z.shape[1]
print(acp.explained_variance_)
# Scree plot: eigenvalue per factor.
plt.plot(np.arange(1,len(acp.explained_variance_)+1),acp.explained_variance_)
plt.title("Scree plot")
plt.ylabel("Eigen values")
plt.xlabel("Factor number")
plt.show()
# Cumulative explained variance ratio.
plt.plot(np.arange(1,len(acp.explained_variance_)+1),np.cumsum(acp.explained_variance_ratio_))
plt.title("Explained variance vs. # of factors")
plt.ylabel("Cumsum explained varianceratio")
plt.xlabel("Factor number")
plt.show()
# Position the individuals in the first factorial plane.
fig, axes = plt.subplots(figsize=(12,12))
axes.set_xlim(-6,6) # same limits on the x axis
axes.set_ylim(-6,6) # and on the y axis
# Label each observation with its site name.
for i in range(n):
    # NOTE(review): df_mean_pp_T.site[i] is a label lookup — it assumes a
    # 0..n-1 index on df_mean_pp_T after the Species filter; verify.
    plt.annotate(df_mean_pp_T.site[i],(coord[i,0],coord[i,1]))
# Draw the axes.
plt.plot([-6,6],[0,0],color='silver',linestyle='-',linewidth=1)
plt.plot([0,0],[-6,6],color='silver',linestyle='-',linewidth=1) # display
plt.show()
# Square roots of the eigenvalues.
sqrt_eigval = np.sqrt(eigval)
# Correlation of the variables with the factorial axes
# (loadings scaled by sqrt of the eigenvalues).
corvar = np.zeros((p,p))
for k in range(p):
    corvar[:,k] = acp.components_[k,:] * sqrt_eigval[k]
# Print the variables x factors correlation matrix.
# print(corvar)
# Correlation circle.
fig, axes = plt.subplots(figsize=(8,8))
axes.set_xlim(-1,1)
axes.set_ylim(-1,1)
# Label each variable with its column name.
for j in range(p):
    plt.annotate(df_mean_pp_T[[v +'_mean' for v in values]].columns[j],(corvar[j,0],corvar[j,1]))
# Draw the axes.
plt.plot([-1,1],[0,0],color='silver',linestyle='-',linewidth=1)
plt.plot([0,0],[-1,1],color='silver',linestyle='-',linewidth=1)
# Add the unit circle.
cercle = plt.Circle((0,0),1,color='blue',fill=False)
axes.add_artist(cercle)
# Display.
plt.show()
# Attach the PCA coordinates to the per-treatment population means and
# regress P50_mean on the first component.
# NOTE(review): axis=1 concat assumes df_mean_pp_T has a 0..n-1 index
# matching acp_coord's RangeIndex — verify.
acp_coord = pd.DataFrame(coord, columns = ['acp_'+str(i) for i in np.arange(0,coord.shape[1])])
df_mean_pp_T_acp = pd.concat([df_mean_pp_T, acp_coord], axis = 1)
fig = px.scatter(df_mean_pp_T_acp, x="acp_0", y="P50_mean", color="Treatment", trendline="ols",
                 text = "site")
fig.update_traces(textposition='top center')
fig.show()
# Same against the second component.
# NOTE(review): this concat duplicates the one just above.
acp_coord = pd.DataFrame(coord, columns = ['acp_'+str(i) for i in np.arange(0,coord.shape[1])])
df_mean_pp_T_acp = pd.concat([df_mean_pp_T, acp_coord], axis = 1)
fig = px.scatter(df_mean_pp_T_acp, x="acp_1", y="P50_mean", color="Treatment", trendline="ols",
                 text = "site")
fig.update_traces(textposition='top center')
fig.show()